Donald Kane

November 8, 2021

require(dplyr)
require(data.table)
library(EnvStats)
d1 <- fread("https://biostat.app.vumc.org/wiki/pub/Main/DataSets/nhgh.tsv") %>% 
  as.data.frame
d1 <- d1 %>% 
  filter(sex == "female") %>% 
  filter(age >= 18) %>% 
  select(gh, ht) %>% 
  filter(1:n()<=1000)

Method of Moments

Normal Distribution

Estimates of parameters

# Height
xbarh <- mean(d1$ht)
xbarh
## [1] 160.7419
stdevh <- sd(d1$ht)
stdevh
## [1] 7.320161
# Glycohemoglobin
xbarg <- mean(d1$gh)
xbarg
## [1] 5.7246
stdevg <- sd(d1$gh)
stdevg
## [1] 1.052246

Overlay Estimated PDF onto Histogram

hist(d1$ht,
     freq = FALSE,
     main = "Normal Height MM",
     breaks = 100)
curve(dnorm(x,
            mean = xbarh,
            sd = stdevh),
      add = TRUE,
      col = "blue",
      lwd = 3,
      main = "Normal Height MM")

hist(d1$gh,
     freq = FALSE,
     main = "Normal Glycohemoglobin MM",
     breaks = 100)
curve(dnorm(x,
            mean = xbarg,
            sd = stdevg),
      add = TRUE,
      col = "red",
      lwd = 3,
      main = "Normal Glycohemoglobin MM")

Overlay Estimated CDF onto ECDF

plot(ecdf(d1$ht),
     main = "Normal Height MM")
curve(pnorm(x,
            mean = xbarh,
            sd = stdevh),
      add = TRUE,
      col = "blue",
      lwd = 3,
      main = "Normal Height MM")

plot(ecdf(d1$gh),
     main = "Normal Glycohemoglobin MM")
curve(pnorm(x,
            mean = xbarg,
            sd = stdevg),
      add = TRUE,
      col = "red",
      lwd = 3,
      main = "Normal Glycohemoglobin MM")

QQ Plot

q_values <- seq(0.05, 0.95, length = 100)

sampleh <- quantile(d1$ht, q_values)
theoreticalh <- qnorm(q_values, mean = xbarh, 
            sd = stdevh)
plot(sampleh, theoreticalh, pch = 16, main = "Q-Q Plot Normal Heights (MM)")
abline(0,1, col = "blue", lwd = 2)

sampleg <- quantile(d1$gh, q_values)
theoreticalg <- qnorm(q_values, mean = xbarg, 
            sd = stdevg)
plot(sampleg, theoreticalg, pch = 16, main = "Q-Q Plot Normal Glycohemoglobin (MM)")
abline(0,1, col = "red", lwd = 2)

Estimated Median

M <- 5000
N <- 1000

simh <- rnorm(N*M, mean = xbarh, 
            sd = stdevh) %>% array(dim = c(M,N))
sample_dist <- apply(simh, 1, median)
hist(sample_dist, breaks = 50, main = "Normal Distribution of Medians \nHeights (MM)", xlab = "Median Height")
abline(v = qnorm(.5, mean = xbarh, sd = stdevh),
       lwd = 3, col = "blue")

qnorm(.5, mean = xbarh, sd = stdevh)
## [1] 160.7419
simg <- rnorm(N*M, mean = xbarg, 
            sd = stdevg) %>% array(dim = c(M,N))
sample_dist <- apply(simg, 1, median)
hist(sample_dist, breaks = 50, main = "Normal Distribution of Medians \nGlycohemoglobin (MM)", xlab = "Median Glycohemoglobin")
abline(v = qnorm(.5, mean = xbarg, sd = stdevg),
       lwd = 3, col = "red")

qnorm(.5, mean = xbarg, sd = stdevg)
## [1] 5.7246

Gamma Distribution

Estimates of parameters

# Height
xbarh <- mean(d1$ht)
stdevh <- var(d1$ht)
shape_hath <- xbarh^2/stdevh
scale_hath <- stdevh/xbarh

# Glycohemoglobin
xbarg <- mean(d1$gh)
stdevg <- var(d1$gh)
shape_hatg <- xbarg^2/stdevg
scale_hatg <- stdevg/xbarg

Overlay Estimated PDF onto Histogram

hist(d1$ht,
     freq = FALSE,
     main = "Gamma Height MM",
     breaks = 75)
curve(dgamma(x, shape = shape_hath, 
            scale = scale_hath),
      add = TRUE,
      col = "blue",
      lwd = 3,
      main = "Gamma Height MM")

hist(d1$gh,
     freq = FALSE,
     main = "Gamma Glycohemoglobin MM",
     breaks = 75)
curve(dgamma(x, shape = shape_hatg, 
            scale = scale_hatg),
      add = TRUE,
      col = "red",
      lwd = 3,
      main = "Gamma Glycohemoglobin MM")

Overlay Estimated CDF onto ECDF

plot(ecdf(d1$ht),
     main = "Gamma Height MM")
curve(pgamma(x,
            shape = shape_hath,
            scale = scale_hath),
      add = TRUE,
      col = "blue",
      lwd = 3,
      main = "Gamma Height MM")

plot(ecdf(d1$gh),
     main = "Gamma Glycohemoglobin MM")
curve(pgamma(x,
            shape = shape_hatg,
            scale = scale_hatg),
      add = TRUE,
      col = "red",
      lwd = 3,
      main = "Gamma Glycohemoglobin MM")

QQ Plot

q_values <- seq(0.01, 0.99, length = 99)

sampleh <- quantile(d1$ht, q_values)
theoreticalh <- qgamma(q_values,
                       shape = shape_hath,
                       scale = scale_hath)
plot(sampleh, theoreticalh, pch = 16, main = "Q-Q Plot Gamma Heights (MM)")
abline(0,1, col = "blue", lwd = 2)

sampleg <- quantile(d1$gh, q_values)
theoreticalg <- qgamma(q_values, 
                       shape = shape_hatg, 
                       scale = scale_hatg)
plot(sampleg, theoreticalg, pch = 16, main = "Q-Q Plot Gamma Glycohemoglobin (MM)")
abline(0,1, col = "red", lwd = 2)

Estimated Median

M <- 5000
N <- 1000

simh <- rgamma(N*M, scale = scale_hath, 
            shape = shape_hath) %>% array(dim = c(M,N))
sample_dist <- apply(simh, 1, median)
hist(sample_dist, breaks = 50, main = "Gamma Distribution of Medians \nHeights (MM)", xlab = "Median Height")
abline(v = qgamma(.5, scale = scale_hath, shape = shape_hath),
       lwd = 3, col = "blue")

qgamma(.5, shape = shape_hath, scale = scale_hath)
## [1] 160.6308
simg <- rgamma(N*M, scale = scale_hatg, shape = shape_hatg) %>% 
  array(dim = c(M,N))
sample_dist <- apply(simg, 1, median)
hist(sample_dist, breaks = 50, main = "Gamma Distribution of Medians \nGlycohemoglobin (MM)", xlab = "Median Glycohemoglobin")
abline(v = qgamma(.5, scale = scale_hatg, shape = shape_hatg),
       lwd = 3, col = "red")

qgamma(.5, scale = scale_hatg, shape = shape_hatg)
## [1] 5.660259

Weibull Distribution

Estimates of parameters

wht <- eweibull(d1$ht, method = "mme")
wht$parameters[1]
##    shape 
## 27.47348
wht$parameters[2]
##    scale 
## 163.9791
wgh <- eweibull(d1$gh, method = "mme")
wgh$parameters[1]
##    shape 
## 6.356614
wgh$parameters[2]
##    scale 
## 6.151127

Overlay Estimated PDF onto Histogram

hist(d1$ht, breaks = 100, freq = FALSE, main = "Weibull Height (MM)", xlab = "Height")
curve(dweibull(x, shape = wht$parameters[1], 
            scale = wht$parameters[2]), 
            col = "blue", 
            lwd = 6,
            add = TRUE)

hist(d1$gh, breaks = 100, freq = FALSE, main = "Weibull Glycohemoglobin (MM)", xlab = "Glycohemoglobin")
curve(dweibull(x, shape = wgh$parameters[1], 
            scale = wgh$parameters[2]), 
            col = "red", 
            lwd = 6,
            add = TRUE)

Overlay Estimated CDF onto ECDF

plot(ecdf(d1$ht),main = "Weibull Height MM", xlab = "Height")
curve(pweibull(
  x, 
  shape = wht$parameters[1], 
  scale = wht$parameters[2] 
), 
add = TRUE, col = "blue", lwd = 2)

plot(ecdf(d1$gh),main = "Weibull Glychohemoglobin MM", xlab = "Height")
curve(pweibull(
  x, 
  shape = wgh$parameters[1], 
  scale = wgh$parameters[2] 
), 
add = TRUE, col = "red", lwd = 2)

QQ Plot

q_values <- seq(0.05, 0.95, length = 100)

sampleh <- quantile(d1$ht, q_values)
theoreticalh <- qweibull(q_values, 
  shape = wht$parameters[1], 
  scale = wht$parameters[2] 
)
plot(sampleh, theoreticalh, pch = 16, main = "Q-Q Plot Weibull Heights (MM)")
abline(0,1, col = "blue", lwd = 2)

sampleg <- quantile(d1$gh, q_values)
theoreticalg <- qweibull(q_values, 
  shape = wgh$parameters[1], 
  scale = wgh$parameters[2] 
)
plot(sampleg, theoreticalg, pch = 16, main = "Q-Q Plot Weibull Glycohemoglobin (MM)")
abline(0,1, col = "red", lwd = 2)

Estimated Median

M <- 5000
N <- 1000

simh <- rweibull(N*M,
                 shape = wht$parameters[1], 
                 scale = wht$parameters[2]) %>% 
  array(dim = c(M,N))
sample_dist <- apply(simh, 1, median)
hist(sample_dist, breaks = 50, main = "Weibull Distribution of Medians \nHeights (MM)", xlab = "Median Height")
abline(v = qweibull(.5,
                    shape = wht$parameters[1], 
                    scale = wht$parameters[2]),
       lwd = 3, col = "blue")

qweibull(.5,
      shape = wht$parameters[1], 
      scale = wht$parameters[2])
## [1] 161.8061
simg <- rweibull(N*M,
                 shape = wgh$parameters[1], 
                 scale = wgh$parameters[2]) %>% 
  array(dim = c(M,N))
sample_dist <- apply(simg, 1, median)
hist(sample_dist, breaks = 50, main = "Weibull Distribution of Medians \nGlycohemoglobin (MM)", xlab = "Median Glycohemoglobin")
abline(v = qweibull(.5,
                    shape = wgh$parameters[1], 
                    scale = wgh$parameters[2]),
       lwd = 3, col = "red")

qweibull(.5,
      shape = wgh$parameters[1], 
      scale = wgh$parameters[2])
## [1] 5.806494

Maximum Likelihood

Normal Distribution

Estimates of parameters

nht <- enorm(d1$ht, method = "mle")
nht$parameters[1]
##     mean 
## 160.7419
nht$parameters[2]
##     sd 
## 7.3165
ngh <- enorm(d1$gh, method = "mle")
ngh$parameters[1]
##   mean 
## 5.7246
ngh$parameters[2]
##      sd 
## 1.05172

Overlay Estimated PDF onto Histogram

hist(d1$ht, breaks = 100, freq = FALSE, main = "Normal Height (MLE)", xlab = "Height")
curve(dnorm(x, mean = nht$parameters[1], 
            sd = nht$parameters[2]), 
            col = "blue", 
            lwd = 6,
            add = TRUE)

hist(d1$gh, breaks = 100, freq = FALSE, main = "Normal Glycohemoglobin (MLE)", xlab = "Glycohemoglobin")
curve(dnorm(x, mean = ngh$parameters[1], 
            sd = ngh$parameters[2]), 
            col = "red", 
            lwd = 6,
            add = TRUE)

Overlay Estimated CDF onto ECDF

plot(ecdf(d1$ht),main = "Normal Height MLE", xlab = "Height")
curve(pnorm(
  x, 
  mean = nht$parameters[1], 
  sd = nht$parameters[2] 
), 
add = TRUE, col = "blue", lwd = 2)

plot(ecdf(d1$gh),main = "Normal Glychohemoglobin MLE", xlab = "Height")
curve(pnorm(
  x, 
  mean = ngh$parameters[1], 
  sd = ngh$parameters[2] 
), 
add = TRUE, col = "red", lwd = 2)

QQ Plot

q_values <- seq(0.05, 0.95, length = 100)

sampleh <- quantile(d1$ht, q_values)
theoreticalh <- qnorm(q_values, 
  mean = nht$parameters[1], 
  sd = nht$parameters[2] 
)
plot(sampleh, theoreticalh, pch = 16, main = "Q-Q Plot Normal Heights (MLE)")
abline(0,1, col = "blue", lwd = 2)

sampleg <- quantile(d1$gh, q_values)
theoreticalg <- qnorm(q_values, 
  mean = ngh$parameters[1], 
  sd = ngh$parameters[2] 
)
plot(sampleg, theoreticalg, pch = 16, main = "Q-Q Plot Normal Glycohemoglobin (MLE)")
abline(0,1, col = "red", lwd = 2)

Estimated Median

M <- 5000
N <- 1000

simh <- rnorm(N*M,
                 mean = nht$parameters[1], 
                 sd = nht$parameters[2]) %>% 
  array(dim = c(M,N))
sample_dist <- apply(simh, 1, median)
hist(sample_dist, breaks = 50, main = "Normal Distribution of Medians \nHeights (MLE)", xlab = "Median Height")
abline(v = qnorm(.5,
                    mean = nht$parameters[1], 
                    sd = nht$parameters[2]),
       lwd = 3, col = "blue")

qnorm(.5,
      mean = nht$parameters[1], 
      sd = nht$parameters[2])
## [1] 160.7419
simg <- rnorm(N*M,
                 mean = ngh$parameters[1], 
                 sd = ngh$parameters[2]) %>% 
  array(dim = c(M,N))
sample_dist <- apply(simg, 1, median)
hist(sample_dist, breaks = 50, main = "Normal Distribution of Medians \nGlycohemoglobin (MLE)", xlab = "Median Glycohemoglobin")
abline(v = qnorm(.5,
                    mean = ngh$parameters[1], 
                    sd = ngh$parameters[2]),
       lwd = 3, col = "red")

qnorm(.5,
      mean = ngh$parameters[1], 
      sd = ngh$parameters[2])
## [1] 5.7246

Gamma Distribution

Estimates of parameters

ght <- egamma(d1$ht, method = "mle")
ght$parameters[1]
##    shape 
## 482.6712
ght$parameters[2]
##     scale 
## 0.3330256
ggh <- egamma(d1$gh, method = "mle")
ggh$parameters[1]
##    shape 
## 40.81761
ggh$parameters[2]
##     scale 
## 0.1402483

Overlay Estimated PDF onto Histogram

hist(d1$ht, breaks = 100, freq = FALSE, main = "Gamma Height (MLE)", xlab = "Height")
curve(dgamma(x, shape = ght$parameters[1], 
            scale = ght$parameters[2]), 
            col = "blue", 
            lwd = 6,
            add = TRUE)

hist(d1$gh, breaks = 100, freq = FALSE, main = "Gamma Glycohemoglobin (MLE)", xlab = "Glycohemoglobin")
curve(dgamma(x, shape = ggh$parameters[1], 
            scale = ggh$parameters[2]), 
            col = "red", 
            lwd = 6,
            add = TRUE)

Overlay Estimated CDF onto ECDF

plot(ecdf(d1$ht),main = "Gamma Height MLE", xlab = "Height")
curve(pgamma(
  x, 
  shape = ght$parameters[1], 
  scale = ght$parameters[2] 
), 
add = TRUE, col = "blue", lwd = 2)

plot(ecdf(d1$gh),main = "Gamma Glychohemoglobin MLE", xlab = "Height")
curve(pgamma(
  x, 
  shape = ggh$parameters[1], 
  scale = ggh$parameters[2] 
), 
add = TRUE, col = "red", lwd = 2)

QQ Plot

q_values <- seq(0.05, 0.95, length = 100)

sampleh <- quantile(d1$ht, q_values)
theoreticalh <- qgamma(q_values, 
  shape = ght$parameters[1], 
  scale = ght$parameters[2] 
)
plot(sampleh, theoreticalh, pch = 16, main = "Q-Q Plot Gamma Heights (MLE)")
abline(0,1, col = "blue", lwd = 2)

sampleg <- quantile(d1$gh, q_values)
theoreticalg <- qgamma(q_values, 
  shape = ggh$parameters[1], 
  scale = ggh$parameters[2] 
)
plot(sampleg, theoreticalg, pch = 16, main = "Q-Q Plot Gamma Glycohemoglobin (MLE)")
abline(0,1, col = "red", lwd = 2)

Estimated Median

M <- 5000
N <- 1000

simh <- rgamma(N*M,
                 shape = ght$parameters[1], 
                 scale = ght$parameters[2]) %>% 
  array(dim = c(M,N))
sample_dist <- apply(simh, 1, median)
hist(sample_dist, breaks = 50, main = "Gamma Distribution of Medians \nHeights (MLE)", xlab = "Median Height")
abline(v = qgamma(.5,
                    shape = ght$parameters[1], 
                    scale = ght$parameters[2]),
       lwd = 3, col = "blue")

qgamma(.5,
      shape = ght$parameters[1], 
      scale = ght$parameters[2])
## [1] 160.6309
simg <- rgamma(N*M,
                 shape = ggh$parameters[1], 
                 scale = ggh$parameters[2]) %>% 
  array(dim = c(M,N))
sample_dist <- apply(simg, 1, median)
hist(sample_dist, breaks = 50, main = "Gamma Distribution of Medians \nGlycohemoglobin (MLE)", xlab = "Median Glycohemoglobin")
abline(v = qgamma(.5,
                    shape = ggh$parameters[1], 
                    scale = ggh$parameters[2]),
       lwd = 3, col = "red")

qgamma(.5,
      shape = ggh$parameters[1], 
      scale = ggh$parameters[2])
## [1] 5.677919

Weibull Distribution

Estimates of parameters

wht <- eweibull(d1$ht, method = "mle")
wht$parameters[1]
##    shape 
## 21.85398
wht$parameters[2]
##    scale 
## 164.2472
wgh <- eweibull(d1$gh, method = "mle")
wgh$parameters[1]
##    shape 
## 4.125254
wgh$parameters[2]
##    scale 
## 6.173884

Overlay Estimated PDF onto Histogram

hist(d1$ht, breaks = 100, freq = FALSE, main = "Weibull Height (MLE)", xlab = "Height")
curve(dweibull(x, shape = wht$parameters[1], 
            scale = wht$parameters[2]), 
            col = "blue", 
            lwd = 6,
            add = TRUE)

hist(d1$gh, breaks = 100, freq = FALSE, main = "Weibull Glycohemoglobin (MLE)", xlab = "Glycohemoglobin")
curve(dweibull(x, shape = wgh$parameters[1], 
            scale = wgh$parameters[2]), 
            col = "red", 
            lwd = 6,
            add = TRUE)

Overlay Estimated CDF onto ECDF

plot(ecdf(d1$ht),main = "Weibull Height MLE", xlab = "Height")
curve(pweibull(
  x, 
  shape = wht$parameters[1], 
  scale = wht$parameters[2] 
), 
add = TRUE, col = "blue", lwd = 2)

plot(ecdf(d1$gh),main = "Weibull Glychohemoglobin MLE", xlab = "Height")
curve(pweibull(
  x, 
  shape = wgh$parameters[1], 
  scale = wgh$parameters[2] 
), 
add = TRUE, col = "red", lwd = 2)

QQ Plot

q_values <- seq(0.05, 0.95, length = 100)

sampleh <- quantile(d1$ht, q_values)
theoreticalh <- qweibull(q_values, 
  shape = wht$parameters[1], 
  scale = wht$parameters[2] 
)
plot(sampleh, theoreticalh, pch = 16, main = "Q-Q Plot Weibull Heights (MLE)")
abline(0,1, col = "blue", lwd = 2)

sampleg <- quantile(d1$gh, q_values)
theoreticalg <- qweibull(q_values, 
  shape = wgh$parameters[1], 
  scale = wgh$parameters[2] 
)
plot(sampleg, theoreticalg, pch = 16, main = "Q-Q Plot Weibull Glycohemoglobin (MLE)")
abline(0,1, col = "red", lwd = 2)

Estimated Median

M <- 5000
N <- 1000

simh <- rweibull(N*M,
                 shape = wht$parameters[1], 
                 scale = wht$parameters[2]) %>% 
  array(dim = c(M,N))
sample_dist <- apply(simh, 1, median)
hist(sample_dist, breaks = 50, main = "Weibull Distribution of Medians \nHeights (MLE)", xlab = "Median Height")
abline(v = qweibull(.5,
                    shape = wht$parameters[1], 
                    scale = wht$parameters[2]),
       lwd = 3, col = "blue")

qweibull(.5,
      shape = wht$parameters[1], 
      scale = wht$parameters[2])
## [1] 161.5156
simg <- rweibull(N*M,
                 shape = wgh$parameters[1], 
                 scale = wgh$parameters[2]) %>% 
  array(dim = c(M,N))
sample_dist <- apply(simg, 1, median)
hist(sample_dist, breaks = 50, main = "Weibull Distribution of Medians \nGlycohemoglobin (MLE)", xlab = "Median Glycohemoglobin")
abline(v = qweibull(.5,
                    shape = wgh$parameters[1], 
                    scale = wgh$parameters[2]),
       lwd = 3, col = "red")

qweibull(.5,
      shape = wgh$parameters[1], 
      scale = wgh$parameters[2])
## [1] 5.64902